elements from the web page and converts them into a CSV format.
Import Movies
# Load the raw movie catalogue; the column types are guessed by readr.
movies <- read_csv(file = "../datasets/movies.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## ID = col_double(),
## Title = col_character(),
## Year = col_double(),
## Age = col_character(),
## IMDb = col_double(),
## `Rotten Tomatoes` = col_character(),
## Netflix = col_double(),
## Hulu = col_double(),
## `Prime Video` = col_double(),
## `Disney+` = col_double(),
## Type = col_double(),
## Directors = col_character(),
## Genres = col_character(),
## Country = col_character(),
## Language = col_character(),
## Runtime = col_double()
## )
# Bring the movie catalogue into the shared column layout: comma-separated
# multi-value fields are re-delimited with " / ", the numeric service
# indicators become logicals (value > 0), `Apple TV+` is set to FALSE for all
# rows, and every row is tagged with Format "Film" and Status "Ended".
movies <- movies %>%
  transmute(
    Title,
    Genre = str_replace_all(Genres, ",", " / "),
    Year,
    Age,
    Runtime,
    IMDb,
    `Rotten Tomatoes`,
    Netflix = Netflix > 0,
    Hulu = Hulu > 0,
    `Prime Video` = `Prime Video` > 0,
    `Disney+` = `Disney+` > 0,
    `Apple TV+` = FALSE,
    Director = str_replace_all(Directors, ",", " / "),
    Country = str_replace_all(Country, ",", " / "),
    Language = str_replace_all(Language, ",", " / "),
    Format = parse_factor("Film", levels = formats),
    Status = parse_factor("Ended", levels = statuses)
  )
movies
## # A tibble: 16,744 x 17
## Title Genre Year Age Runtime IMDb `Rotten Tomatoe… Netflix Hulu
## <chr> <chr> <dbl> <chr> <dbl> <dbl> <chr> <lgl> <lgl>
## 1 Ince… Acti… 2010 13+ 148 8.8 87% TRUE FALSE
## 2 The … Acti… 1999 18+ 136 8.7 87% TRUE FALSE
## 3 Aven… Acti… 2018 13+ 149 8.5 84% TRUE FALSE
## 4 Back… Adve… 1985 7+ 116 8.5 96% TRUE FALSE
## 5 The … West… 1966 18+ 161 8.8 97% TRUE FALSE
## 6 Spid… Anim… 2018 7+ 117 8.4 97% TRUE FALSE
## 7 The … Biog… 2002 18+ 150 8.5 95% TRUE FALSE
## 8 Djan… Dram… 2012 18+ 165 8.4 87% TRUE FALSE
## 9 Raid… Acti… 1981 7+ 115 8.4 95% TRUE FALSE
## 10 Ingl… Adve… 2009 18+ 153 8.3 89% TRUE FALSE
## # … with 16,734 more rows, and 8 more variables: `Prime Video` <lgl>,
## # `Disney+` <lgl>, `Apple TV+` <lgl>, Director <chr>, Country <chr>,
## # Language <chr>, Format <fct>, Status <fct>
Join with Shows
# Merge `shows` with `natural_joined` on Title. Columns that exist on both
# sides are referred to below with the `.x` (left) and `.y` (right) suffixes.
content <- full_join(shows, natural_joined, by = "Title")
# For each duplicated column keep the `.x` value and fall back to the `.y`
# value where `.x` is missing.
content <- mutate(content, `Year.x` = as.integer(ifelse(is.na(`Year.x`), `Year.y`, `Year.x`)))
content <- mutate(content, `Age.x` = ifelse(is.na(`Age.x`), `Age.y`, `Age.x`))
content <- mutate(content, `IMDb.x` = ifelse(is.na(`IMDb.x`), `IMDb.y`, `IMDb.x`))
content <- mutate(content, `Rotten Tomatoes.x` = ifelse(is.na(`Rotten Tomatoes.x`), `Rotten Tomatoes.y`, `Rotten Tomatoes.x`))
content <- mutate(content, `Netflix.x` = ifelse(is.na(`Netflix.x`), `Netflix.y`, `Netflix.x`))
content <- mutate(content, `Hulu.x` = ifelse(is.na(`Hulu.x`), `Hulu.y`, `Hulu.x`))
content <- mutate(content, `Prime Video.x` = ifelse(is.na(`Prime Video.x`), `Prime Video.y`, `Prime Video.x`))
content <- mutate(content, `Disney+.x` = ifelse(is.na(`Disney+.x`), `Disney+.y`, `Disney+.x`))
content <- mutate(content, `Apple TV+.x` = ifelse(is.na(`Apple TV+.x`), `Apple TV+.y`, `Apple TV+.x`))
# Format and Status are stored as integers here and are mapped back to labels
# through `formats[...]` / `statuses[...]` further down.
# NOTE(review): presumably both inputs hold these as factors whose level order
# matches `formats` / `statuses`, so the integer codes line up -- confirm.
content <- mutate(content, `Format.x` = as.integer(ifelse(is.na(`Format.x`), `Format.y`, `Format.x`)))
content <- mutate(content, `Status.x` = as.integer(ifelse(is.na(`Status.x`), `Status.y`, `Status.x`)))
# Drop the right-hand duplicates and strip the `.x` suffix from the survivors.
content <- content[, !(names(content) %in% c("Year.y", "Age.y", "IMDb.y", "Rotten Tomatoes.y", "Netflix.y", "Hulu.y", "Prime Video.y", "Disney+.y", "Apple TV+.y", "Format.y", "Status.y"))]
content <- rename(content, Year = `Year.x`, Age = `Age.x`, IMDb = `IMDb.x`, `Rotten Tomatoes` = `Rotten Tomatoes.x`, Netflix = `Netflix.x`, Hulu = `Hulu.x`, `Prime Video` = `Prime Video.x`, `Disney+` = `Disney+.x`, `Apple TV+` = `Apple TV+.x`, Format = `Format.x`, Status = `Status.x`)
# Split the premiere date into integer month and day-of-month columns
# (NA where there is no premiere date).
content <- mutate(content, Month = as.integer(ifelse(is.na(Premiere), NA, as.numeric(format(Premiere, "%m")))))
content <- mutate(content, Day = as.integer(ifelse(is.na(Premiere), NA, as.numeric(format(Premiere, "%d")))))
# Missing age ratings become the explicit label "unknown".
content <- mutate(content, Age = parse_factor(ifelse(is.na(Age), "unknown", Age), levels = ages))
# Missing origins default to "Distribution"; the value produced by ifelse() is
# used as an index into `origins` to get the label back.
content <- mutate(content, Origin = parse_factor(origins[ifelse(is.na(Origin), parse_factor("Distribution", levels = origins), Origin)], levels = origins))
# Turn the integer Format / Status values from above back into factors.
content <- mutate(content, Format = parse_factor(formats[Format], levels = formats))
content <- mutate(content, Status = parse_factor(statuses[Status], levels = statuses))
# Strip the percent sign and store the Rotten Tomatoes score as an integer.
content <- mutate(content, `Rotten Tomatoes` = as.integer(str_replace(`Rotten Tomatoes`, "%", "")))
# Where Year is still missing, take it from the premiere date if there is one.
content <- mutate(content, Year = ifelse(is.na(Year), ifelse(is.na(Premiere), NA, lubridate::year(Premiere)), Year))
# Drop the Premiere column; the enclosing parentheses also print the result.
(content <- content[, !(names(content) %in% c("Premiere"))])
## # A tibble: 24,532 x 25
## Title Year Age IMDb `Rotten Tomatoe… Netflix Hulu `Prime Video`
## <chr> <dbl> <fct> <dbl> <int> <lgl> <lgl> <lgl>
## 1 Brea… 2008 18+ 9.5 96 TRUE FALSE FALSE
## 2 Stra… 2016 16+ 8.8 93 TRUE FALSE FALSE
## 3 Mone… 2017 18+ 8.4 91 TRUE FALSE FALSE
## 4 Sher… 2010 16+ 9.1 78 TRUE FALSE FALSE
## 5 Bett… 2015 18+ 8.7 97 TRUE FALSE FALSE
## 6 The … 2005 16+ 8.9 81 TRUE FALSE FALSE
## 7 Blac… 2011 18+ 8.8 83 TRUE FALSE FALSE
## 8 Supe… 2005 16+ 8.4 93 TRUE FALSE FALSE
## 9 Peak… 2013 18+ 8.8 92 TRUE FALSE FALSE
## 10 Avat… 2005 7+ 9.2 100 TRUE FALSE FALSE
## # … with 24,522 more rows, and 17 more variables: `Disney+` <lgl>, `Apple
## # TV+` <lgl>, Format <fct>, Status <fct>, Genre <chr>, Director <chr>,
## # Country <chr>, Language <chr>, Episodes <int>, Seasons <int>,
## # Min_Time <int>, Max_Time <int>, Service <fct>, Origin <fct>, Network <fct>,
## # Month <int>, Day <int>
Add missing Apple TV+ original age ratings
# Load the separately collected age ratings (columns Title and Age).
apple_ages <- read_csv(file = "../datasets/apple_ages.csv")
## Parsed with column specification:
## cols(
## Title = col_character(),
## Age = col_character()
## )
# Backfill age ratings from `apple_ages` and harmonise the rating labels.
apple_ages <- mutate(apple_ages, Age = parse_factor(Age, levels = ages))
content <- full_join(content, apple_ages, by = "Title")
# Keep the existing rating unless it is missing or "unknown"; in that case take
# the rating from `apple_ages` (or "unknown" when that is missing as well).
# The comparison is done on labels, not on factor codes: ifelse() reduces a
# factor to its integer codes, and such codes only match positions in `ages`
# when the factor was built with `levels = ages`.
content <- mutate(content, `Age.x` = parse_factor(
  ifelse(
    is.na(`Age.x`) | `Age.x` == "unknown",
    ifelse(is.na(`Age.y`), "unknown", as.character(`Age.y`)),
    as.character(`Age.x`)
  ),
  levels = ages
))
content <- content[, !(names(content) %in% c("Age.y"))]
content <- rename(content, Age = `Age.x`)
rm("apple_ages")
# Collapse equivalent ratings onto one label each:
# "7+" -> "6+", "13+" -> "12+", "all" -> "0+". All other labels are kept.
# The enclosing parentheses print the result.
(content <- mutate(content, Age = parse_factor(
  recode(as.character(Age), "7+" = "6+", "13+" = "12+", "all" = "0+"),
  levels = ages
)))
## # A tibble: 24,536 x 25
## Title Year Age IMDb `Rotten Tomatoe… Netflix Hulu `Prime Video`
## <chr> <dbl> <fct> <dbl> <int> <lgl> <lgl> <lgl>
## 1 Brea… 2008 18+ 9.5 96 TRUE FALSE FALSE
## 2 Stra… 2016 16+ 8.8 93 TRUE FALSE FALSE
## 3 Mone… 2017 18+ 8.4 91 TRUE FALSE FALSE
## 4 Sher… 2010 16+ 9.1 78 TRUE FALSE FALSE
## 5 Bett… 2015 18+ 8.7 97 TRUE FALSE FALSE
## 6 The … 2005 16+ 8.9 81 TRUE FALSE FALSE
## 7 Blac… 2011 18+ 8.8 83 TRUE FALSE FALSE
## 8 Supe… 2005 16+ 8.4 93 TRUE FALSE FALSE
## 9 Peak… 2013 18+ 8.8 92 TRUE FALSE FALSE
## 10 Avat… 2005 6+ 9.2 100 TRUE FALSE FALSE
## # … with 24,526 more rows, and 17 more variables: `Disney+` <lgl>, `Apple
## # TV+` <lgl>, Format <fct>, Status <fct>, Genre <chr>, Director <chr>,
## # Country <chr>, Language <chr>, Episodes <int>, Seasons <int>,
## # Min_Time <int>, Max_Time <int>, Service <fct>, Origin <fct>, Network <fct>,
## # Month <int>, Day <int>
Data Exploration
Production of original content throughout the years
To see how the production of original content has evolved, we counted how many original productions were released in each year. The hypothesis we want to check is that the streaming services produce more original content over time, since producing their own content lets them avoid paying license fees for the movies and series they offer.
To make the following plots easier to read, we picked colors from the streaming services’ logos, so that the readers can identify the streaming services more naturally.
# Add the premiere year of every original production; it is used as a
# categorical variable later on. Rows without a premiere year and rows from
# 2021 are left out.
prem_year <- original_content %>%
  mutate(Premiere_Year = lubridate::year(Premiere)) %>%
  filter(!is.na(Premiere_Year), Premiere_Year != 2021)

# Fill colours for the services, in the order of the levels of `Service`.
# NOTE(review): presumably Netflix, Amazon Prime Video, Hulu, Disney+,
# Apple TV+ -- confirm against levels(prem_year$Service).
original_colors <- c(
  rgb(229, 9, 20, 255, maxColorValue = 255),
  rgb(0, 163, 218, 255, maxColorValue = 255),
  rgb(28, 231, 131, 255, maxColorValue = 255),
  rgb(148, 31, 138, 255, maxColorValue = 255),
  rgb(0, 0, 0, 255, maxColorValue = 255)
)

# Stacked bar chart: productions per premiere year, split by service. The fill
# encodes the streaming service, so the legend is titled accordingly.
ggplot(data = prem_year, mapping = aes(x = Premiere_Year, fill = as.factor(Service))) +
  geom_bar() +
  theme(title = element_text(size = 14), axis.title = element_text(size = 12), legend.title = element_text(size = 12), legend.text = element_text(size = 10)) +
  labs(fill = "Streaming Service", title = "Release Years", x = "Release Year", y = "Number of movies and series") +
  scale_fill_manual(values = original_colors)
Linear regression model for the amount of original content until 2050
years <- 2005:2020

# Count the productions that premiered in each of the given years.
#
# years:     vector of years to count for.
# dataframe: data frame (or tibble) with a `Premiere_Year` column.
#
# Returns an integer vector with one count per element of `years`, in the same
# order. Rows with a missing `Premiere_Year` are never counted. An empty
# `years` gives integer(0).
productions <- function(years, dataframe) {
  stopifnot("Premiere_Year" %in% names(dataframe))
  premiere_years <- dataframe[["Premiere_Year"]]
  vapply(
    years,
    function(year) sum(premiere_years == year, na.rm = TRUE),
    integer(1),
    USE.NAMES = FALSE
  )
}
# Fit a simple linear model relating the years in `years` to the number of
# productions counted for each of those years, then print the model summary.
# NOTE(review): as written, the calendar year is the response and the
# production count is the predictor; the section title suggests the count
# should be modelled as a function of the year -- confirm the intended direction.
# NOTE(review): `data` keeps only rows whose Premiere_Year is NOT in `years`,
# and neither formula term looks like a column of that data frame, so the
# argument appears to have no effect on the fit -- verify.
linear_regression <- lm(years ~ productions(years, prem_year), data = filter(prem_year, is.na(match(Premiere_Year, years))))
summary(linear_regression)
##
## Call:
## lm(formula = years ~ productions(years, prem_year), data = filter(prem_year,
## is.na(match(Premiere_Year, years))))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1336 -2.1853 0.2464 2.2166 3.8083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.010e+03 8.509e-01 2362.446 < 2e-16 ***
## productions(years, prem_year) 1.533e-02 2.939e-03 5.217 0.00013 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.872 on 14 degrees of freedom
## Multiple R-squared: 0.6603, Adjusted R-squared: 0.6361
## F-statistic: 27.22 on 1 and 14 DF, p-value: 0.0001304
plot(linear_regression)




Change in Netflix content origin over the years
# Per year (2007 onwards), the share of each content origin among the titles
# available on Netflix.
netflix_origin_change <- content %>%
  filter(Netflix == TRUE, Year >= 2007) %>%
  group_by(Year, Origin) %>%
  summarise(n = n()) %>%
  mutate(percentage = n / sum(n))
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Stacked area chart of those shares over the years.
ggplot(data = netflix_origin_change) +
  geom_area(mapping = aes(x = Year, y = percentage, fill = factor(Origin)))

Change in Amazon Prime Video content origin over the years
# Per year (2006 onwards), the share of each content origin among the titles
# available on Prime Video.
amazon_origin_change <- content %>%
  filter(`Prime Video` == TRUE, Year >= 2006) %>%
  group_by(Year, Origin) %>%
  summarise(n = n()) %>%
  mutate(Percentage = n / sum(n))
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Stacked area chart of those shares over the years.
ggplot(data = amazon_origin_change) +
  geom_area(mapping = aes(x = Year, y = Percentage, fill = factor(Origin)))

Change in Hulu content origin over the years
# Per year (2007 onwards), the share of each content origin among the titles
# available on Hulu.
hulu_origin_change <- content %>%
  filter(`Hulu` == TRUE, Year >= 2007) %>%
  group_by(Year, Origin) %>%
  summarise(n = n()) %>%
  mutate(Percentage = n / sum(n))
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Stacked area chart of those shares over the years.
ggplot(data = hulu_origin_change) +
  geom_area(mapping = aes(x = Year, y = Percentage, fill = factor(Origin)))

Change in Disney+ content origin over the years
# Per year (2019 onwards), the share of each content origin among the titles
# available on Disney+.
disney_origin_change <- content %>%
  filter(`Disney+` == TRUE, Year >= 2019) %>%
  group_by(Year, Origin) %>%
  summarise(n = n()) %>%
  mutate(Percentage = n / sum(n))
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Stacked area chart of those shares over the years.
ggplot(data = disney_origin_change) +
  geom_area(mapping = aes(x = Year, y = Percentage, fill = factor(Origin)))

#scale_x_discrete("Year", limits = c(2019.00, 2019.25, 2019.50, 2019.75, 2020.00), labels = c("2019 Q1", "2019 Q2", "2019 Q3", "2019 Q4", "2020 Q1"))
Change in Apple TV+ content origin over the years
# Per year (2019 onwards), the share of each content origin among the titles
# available on Apple TV+.
apple_origin_change <- content %>%
  filter(`Apple TV+` == TRUE, Year >= 2019) %>%
  group_by(Year, Origin) %>%
  summarise(n = n()) %>%
  mutate(Percentage = n / sum(n))
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Stacked area chart of those shares over the years.
ggplot(data = apple_origin_change) +
  geom_area(mapping = aes(x = Year, y = Percentage, fill = factor(Origin)))

Production of original content grouped by months
It is commonly assumed that most of the interesting movies and series are released during the autumn and winter months, so we checked whether this claim also holds for original content.
# Label every original production with the abbreviated name of its premiere
# month. Rows without a premiere date and rows from 2021 are left out.
prem_month <- original_content %>%
  mutate(
    Premiere_Month = as.character(
      lubridate::month(Premiere, label = TRUE, abbr = TRUE)
    )
  ) %>%
  filter(!is.na(lubridate::year(Premiere)), lubridate::year(Premiere) != 2021)

# Stacked bar chart: productions per premiere month, split by service.
# reorder() puts the bars in calendar order instead of alphabetical order. The
# fill encodes the streaming service, so the legend is titled accordingly.
ggplot(data = prem_month, mapping = aes(x = reorder(Premiere_Month, lubridate::month(Premiere)), fill = as.factor(Service))) +
  geom_bar(width = 0.85) +
  theme(title = element_text(size = 14), axis.title = element_text(size = 12), legend.title = element_text(size = 12), legend.text = element_text(size = 10)) +
  labs(fill = "Streaming Service", title = "Release Months", x = "Release Month", y = "Number of movies and series") +
  scale_fill_manual(values = original_colors)
Our impression was that the number of series relative to movies has also increased steadily, so we looked at how this ratio evolved over the years for the various streaming services. We first created two separate graphs that show the number of movies and the number of series per streaming service, but, as you can see, they do not help much, because all streaming services tend to produce more original content of both kinds over time. This led to the approach shown in the last of the three graphs. We set 50% as the baseline for years in which a service released neither movies nor series. From there on we can read off the share of series among all productions (series plus movies) of a service. For Netflix, for example, it is reasonable to say that a trend can be recognized in the data that has been gathered so far.
# Number of original films and series per service and premiere year.
# Premiere_Year is numeric, so it is converted with as.integer(); strtoi() is
# meant for parsing strings. Other formats are dropped, and the result is
# returned as a plain (ungrouped) tibble.
series_and_movies <- prem_year %>%
  group_by(Service, Premiere_Year = as.integer(Premiere_Year), Format) %>%
  summarise(Entries = n()) %>%
  filter(Format %in% c("Film", "Series")) %>%
  as_tibble()
## `summarise()` regrouping output by 'Service', 'Premiere_Year' (override with `.groups` argument)
series_and_movies
## # A tibble: 54 x 4
## Service Premiere_Year Format Entries
## <fct> <int> <fct> <int>
## 1 Netflix 2005 Film 1
## 2 Netflix 2012 Series 1
## 3 Netflix 2012 Film 2
## 4 Netflix 2013 Series 6
## 5 Netflix 2013 Film 8
## 6 Netflix 2014 Series 7
## 7 Netflix 2014 Film 12
## 8 Netflix 2015 Series 28
## 9 Netflix 2015 Film 24
## 10 Netflix 2016 Series 57
## # … with 44 more rows
# Share of series among all films and series, per service and premiere year.
#
# dataframe:     data frame (or tibble) with the columns Service,
#                Premiere_Year, Format ("Film" / "Series") and Entries.
# years:         years to report on; defaults to 2000:2020.
# service_names: services to report on; defaults to the global `services`.
#
# Returns a data.frame with one row per service/year combination (services in
# the given order, years nested inside) and the columns Service (factor),
# Premiere_Year and Percentage. Percentage is series / (series + films) * 100,
# and 50 when the service released neither films nor series in that year.
percentage_of_series <- function(dataframe, years = 2000:2020,
                                 service_names = services) {
  service_labels <- as.character(service_names)
  grid_service <- rep(service_labels, each = length(years))
  grid_year <- rep(years, times = length(service_labels))

  # Sum of Entries of one format for every service/year cell of the grid.
  entries_of <- function(format) {
    has_format <- dataframe[["Format"]] == format
    vapply(
      seq_along(grid_service),
      function(i) {
        hit <- has_format &
          dataframe[["Service"]] == grid_service[i] &
          dataframe[["Premiere_Year"]] == grid_year[i]
        # which() drops rows where the comparison is NA.
        sum(dataframe[["Entries"]][which(hit)])
      },
      numeric(1)
    )
  }

  nr_series <- entries_of("Series")
  nr_films <- entries_of("Film")
  nr_total <- nr_series + nr_films

  data.frame(
    Service = factor(grid_service, levels = unique(service_labels)),
    Premiere_Year = grid_year,
    Percentage = as.numeric(
      ifelse(nr_total == 0, 50, nr_series / nr_total * 100)
    )
  )
}
percentage_of_series(series_and_movies)
## Service Premiere_Year Percentage
## 1 Netflix 2000 50.00000
## 2 Netflix 2001 50.00000
## 3 Netflix 2002 50.00000
## 4 Netflix 2003 50.00000
## 5 Netflix 2004 50.00000
## 6 Netflix 2005 0.00000
## 7 Netflix 2006 50.00000
## 8 Netflix 2007 50.00000
## 9 Netflix 2008 50.00000
## 10 Netflix 2009 50.00000
## 11 Netflix 2010 50.00000
## 12 Netflix 2011 50.00000
## 13 Netflix 2012 33.33333
## 14 Netflix 2013 42.85714
## 15 Netflix 2014 36.84211
## 16 Netflix 2015 53.84615
## 17 Netflix 2016 41.00719
## 18 Netflix 2017 31.96721
## 19 Netflix 2018 36.56174
## 20 Netflix 2019 43.47826
## 21 Netflix 2020 47.28370
## 22 Amazon Prime Video 2000 50.00000
## 23 Amazon Prime Video 2001 50.00000
## 24 Amazon Prime Video 2002 50.00000
## 25 Amazon Prime Video 2003 50.00000
## 26 Amazon Prime Video 2004 50.00000
## 27 Amazon Prime Video 2005 50.00000
## 28 Amazon Prime Video 2006 50.00000
## 29 Amazon Prime Video 2007 50.00000
## 30 Amazon Prime Video 2008 50.00000
## 31 Amazon Prime Video 2009 50.00000
## 32 Amazon Prime Video 2010 50.00000
## 33 Amazon Prime Video 2011 50.00000
## 34 Amazon Prime Video 2012 50.00000
## 35 Amazon Prime Video 2013 100.00000
## 36 Amazon Prime Video 2014 100.00000
## 37 Amazon Prime Video 2015 100.00000
## 38 Amazon Prime Video 2016 100.00000
## 39 Amazon Prime Video 2017 97.43590
## 40 Amazon Prime Video 2018 93.47826
## 41 Amazon Prime Video 2019 85.36585
## 42 Amazon Prime Video 2020 58.09524
## 43 Hulu 2000 50.00000
## 44 Hulu 2001 50.00000
## 45 Hulu 2002 50.00000
## 46 Hulu 2003 50.00000
## 47 Hulu 2004 50.00000
## 48 Hulu 2005 50.00000
## 49 Hulu 2006 50.00000
## 50 Hulu 2007 50.00000
## 51 Hulu 2008 50.00000
## 52 Hulu 2009 50.00000
## 53 Hulu 2010 100.00000
## 54 Hulu 2011 100.00000
## 55 Hulu 2012 100.00000
## 56 Hulu 2013 100.00000
## 57 Hulu 2014 100.00000
## 58 Hulu 2015 100.00000
## 59 Hulu 2016 100.00000
## 60 Hulu 2017 68.75000
## 61 Hulu 2018 70.58824
## 62 Hulu 2019 72.00000
## 63 Hulu 2020 71.79487
## 64 Disney+ 2000 50.00000
## 65 Disney+ 2001 50.00000
## 66 Disney+ 2002 50.00000
## 67 Disney+ 2003 50.00000
## 68 Disney+ 2004 50.00000
## 69 Disney+ 2005 50.00000
## 70 Disney+ 2006 50.00000
## 71 Disney+ 2007 50.00000
## 72 Disney+ 2008 50.00000
## 73 Disney+ 2009 50.00000
## 74 Disney+ 2010 50.00000
## 75 Disney+ 2011 50.00000
## 76 Disney+ 2012 50.00000
## 77 Disney+ 2013 50.00000
## 78 Disney+ 2014 50.00000
## 79 Disney+ 2015 50.00000
## 80 Disney+ 2016 50.00000
## 81 Disney+ 2017 50.00000
## 82 Disney+ 2018 50.00000
## 83 Disney+ 2019 68.42105
## 84 Disney+ 2020 49.18033
## 85 Apple TV+ 2000 50.00000
## 86 Apple TV+ 2001 50.00000
## 87 Apple TV+ 2002 50.00000
## 88 Apple TV+ 2003 50.00000
## 89 Apple TV+ 2004 50.00000
## 90 Apple TV+ 2005 50.00000
## 91 Apple TV+ 2006 50.00000
## 92 Apple TV+ 2007 50.00000
## 93 Apple TV+ 2008 50.00000
## 94 Apple TV+ 2009 50.00000
## 95 Apple TV+ 2010 50.00000
## 96 Apple TV+ 2011 50.00000
## 97 Apple TV+ 2012 50.00000
## 98 Apple TV+ 2013 50.00000
## 99 Apple TV+ 2014 50.00000
## 100 Apple TV+ 2015 50.00000
## 101 Apple TV+ 2016 50.00000
## 102 Apple TV+ 2017 50.00000
## 103 Apple TV+ 2018 50.00000
## 104 Apple TV+ 2019 83.33333
## 105 Apple TV+ 2020 72.72727
# Line chart: number of original films per year for each streaming service.
# geom_line() has no `label` parameter, so none is passed.
series_and_movies %>%
  filter(Format == "Film") %>%
  ggplot(mapping = aes(x = Premiere_Year, y = Entries, colour = as.factor(Service))) +
  geom_line() +
  labs(colour = "Streaming Services", title = "Amount of released Movies by streaming services", x = "Year", y = "Number of movies") +
  scale_colour_manual(values = original_colors)
## Warning: Ignoring unknown parameters: label

# Line chart: number of original series per year for each streaming service.
# geom_line() has no `label` parameter, so none is passed.
series_and_movies %>%
  filter(Format == "Series") %>%
  ggplot(mapping = aes(x = Premiere_Year, y = Entries, colour = as.factor(Service))) +
  geom_line() +
  labs(colour = "Streaming Services", title = "Amount of released Series by streaming services", x = "Year", y = "Number of series") +
  scale_colour_manual(values = original_colors)
## Warning: Ignoring unknown parameters: label

# Line chart: yearly percentage of series per streaming service.
percentage_of_series(series_and_movies) %>%
  ggplot(mapping = aes(x = Premiere_Year, y = Percentage, colour = as.factor(Service))) +
  geom_line() +
  theme(title = element_text(size = 14), axis.title = element_text(size = 12), legend.title = element_text(size = 12), legend.text = element_text(size = 10)) +
  labs(colour = "Streaming Services", title = "% of series compared to movies", x = "Release Year", y = "% of series") +
  scale_color_manual(values = original_colors)

Genre counter
# Tally how often each genre occurs in the original content. A title can list
# several genres separated by "/", so all genre strings are joined and split
# again. Missing genres are skipped; paste() would otherwise turn them into
# the literal genre "NA".
genre_string <- paste(
  original_content$Genre[!is.na(original_content$Genre)],
  collapse = "/"
)
genre_vector <- strsplit(genre_string, "/", fixed = TRUE)[[1]]
# Remove every space, including those inside multi-word genre names.
genre_vector_clean <- gsub(" ", "", genre_vector, fixed = TRUE)
# One row per genre: Genre (factor) and Count (number of occurrences).
genre_frame <- as.data.frame(
  table(Genre = genre_vector_clean),
  responseName = "Count"
)
# Return the `x` most frequent genres.
#
# dataframe: data frame (or tibble) with the columns Genre and Count.
# x:         number of genres to return; a single non-negative number.
#
# Returns a data.frame with the columns Genre and Entries, sorted by
# descending count (ties keep their input order). When `x` exceeds the number
# of genres all genres are returned; `x = 0` gives an empty data.frame.
top_x_genres <- function(dataframe, x) {
  stopifnot(is.numeric(x), length(x) == 1, !is.na(x), x >= 0)
  # Radix ordering is stable, so equal counts stay in their original order.
  by_count <- order(dataframe[["Count"]], decreasing = TRUE, method = "radix")
  ranked <- data.frame(
    Genre = dataframe[["Genre"]][by_count],
    Entries = dataframe[["Count"]][by_count]
  )
  head(ranked, x)
}
# The 20 most frequent genres as a tibble with integer counts. Uses
# as_tibble() (as.tibble() is deprecated) and transmute() for the row-wise
# column conversion.
genre_count <- top_x_genres(genre_frame, 20) %>%
  transmute(Genre, Entries = as.integer(Entries)) %>%
  as_tibble()
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Horizontal bar chart of the genre counts, bars ordered by count.
genre_count %>%
  ggplot(mapping = aes(x = reorder(Genre, desc(Entries)), y = Entries, fill = Entries)) +
  geom_col(show.legend = FALSE) +
  theme(title = element_text(size = 14), axis.title.x = element_text(size = 12), axis.text.y = element_text(size = 11)) +
  labs(title = "Top 20 Genres - Original Content", x = "", y = "No. of original content productions") +
  coord_flip()

Age rating for all content
# Titles per year, stacked by age rating: all content.
content %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 1121 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(Format == "Series") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 603 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(Format == "Film") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 498 rows containing non-finite values (stat_bin).
### Age rating for all Netflix content
# Titles per year, stacked by age rating: everything available on Netflix.
content %>%
  filter(Netflix == TRUE) %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, series only.
content %>%
  filter(Netflix == TRUE, Format == "Series") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, films only.
content %>%
  filter(Netflix == TRUE, Format == "Film") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

Age rating for all original Netflix content
# Titles per year, stacked by age rating: Netflix originals.
content %>%
  filter(Service == "Netflix", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 725 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(Service == "Netflix", Format == "Series", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 301 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(Service == "Netflix", Format == "Film", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 416 rows containing non-finite values (stat_bin).

Age rating for all Amazon Prime Video content
# Titles per year, stacked by age rating: everything available on Prime Video.
content %>%
  filter(`Prime Video` == TRUE) %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, series only.
content %>%
  filter(`Prime Video` == TRUE, Format == "Series") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, films only.
content %>%
  filter(`Prime Video` == TRUE, Format == "Film") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

Age rating for all original Amazon Prime Video content
# Titles per year, stacked by age rating: Amazon Prime Video originals.
content %>%
  filter(Service == "Amazon Prime Video", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 120 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(Service == "Amazon Prime Video", Format == "Series", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 110 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(Service == "Amazon Prime Video", Format == "Film", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 10 rows containing non-finite values (stat_bin).

Age rating for all Hulu content
# Titles per year, stacked by age rating: everything available on Hulu.
content %>%
  filter(Hulu == TRUE) %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, series only.
content %>%
  filter(Hulu == TRUE, Format == "Series") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, films only.
content %>%
  filter(Hulu == TRUE, Format == "Film") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

Age rating for all original Hulu content
# Titles per year, stacked by age rating: Hulu originals.
content %>%
  filter(Service == "Hulu", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 60 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(Service == "Hulu", Format == "Series", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 52 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(Service == "Hulu", Format == "Film", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 8 rows containing non-finite values (stat_bin).

Age rating for all Disney+ content
# Titles per year, stacked by age rating: everything available on Disney+.
content %>%
  filter(`Disney+` == TRUE) %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, series only.
content %>%
  filter(`Disney+` == TRUE, Format == "Series") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

# Same chart, films only.
content %>%
  filter(`Disney+` == TRUE, Format == "Film") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")

Age rating for all original Disney+ content
# Titles per year, stacked by age rating: Disney+ originals.
content %>%
  filter(Service == "Disney+", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 89 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(Service == "Disney+", Format == "Series", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 55 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(Service == "Disney+", Format == "Film", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 30 rows containing non-finite values (stat_bin).

Age rating for all Apple TV+ content
# Titles per year, stacked by age rating: everything available on Apple TV+.
content %>%
  filter(`Apple TV+` == TRUE) %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 66 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(`Apple TV+` == TRUE, Format == "Series") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 52 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(`Apple TV+` == TRUE, Format == "Film") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 10 rows containing non-finite values (stat_bin).

Age rating for all original Apple TV+ content
# Titles per year, stacked by age rating: Apple TV+ originals.
content %>%
  filter(Service == "Apple TV+", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 66 rows containing non-finite values (stat_bin).

# Same chart, series only.
content %>%
  filter(Service == "Apple TV+", Format == "Series", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 52 rows containing non-finite values (stat_bin).

# Same chart, films only.
content %>%
  filter(Service == "Apple TV+", Format == "Film", Origin == "Original") %>%
  ggplot(mapping = aes(x = Year, fill = factor(Age))) +
  geom_histogram(binwidth = 1) +
  labs(fill = "Age")
## Warning: Removed 10 rows containing non-finite values (stat_bin).

Exporting the data frame to CSV
To make our work reusable for other people, we also export the final data frame for later use. We are also planning to publish this data set on Kaggle.
# Export the original-content data frame as CSV one directory up.
# NOTE(review): row.names = TRUE writes the row index as an extra, unnamed
# first column -- confirm that this is wanted in the published file.
write.csv(
  x = original_content,
  file = "../original_content.csv",
  row.names = TRUE
)